#!/usr/bin/Rscript
##########################################################################################
# Social Media and Policy Responses to the COVID-19 Pandemic in Switzerland
##########################################################################################
# Description:
##########################################################################################
# Data Transformation for VAR Model 
##########################################################################################
# Contents
##########################################################################################
# 1) Dependencies
# 2) Load Data
# 3) Transform Data
## 3.1) Minor transformations: recode paper names in so_txt (the smd database contains errors), recode parties, filter actors, drop retweets
## 3.2) Get Data into Shape
# 4) Save Data 
##########################################################################################
# 1) Dependencies
##########################################################################################
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(vars))
suppressPackageStartupMessages(library(boot))
##########################################################################################
# 2) Load Data
##########################################################################################
# - start from a clean workspace (kept from the original script; note that
#   this wipes every object in the global environment when run interactively)
rm(list = ls())

# - set dir: locate the folder that contains this script, so that the
#   relative "../data/..." paths used below resolve no matter where R was
#   started from.
args <- commandArgs()

# Rscript hands over the script location as "--file=<path>".
scriptName <- args[startsWith(args, "--file=")]

if (length(scriptName) == 0) {
  # No "--file=" argument: the script was not started through Rscript, so ask
  # RStudio for the path of the document open in the source editor.
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  # Strip the "--file=" prefix; only the first match is used.
  scriptName <- sub("^--file=", "", scriptName[1])
}

# dirname() returns "." for a bare file name (e.g. `Rscript script.R` started
# from the script's own folder). The previous strsplit()-based arithmetic
# produced NA in that case, which made setwd() fail.
pathName <- dirname(scriptName)

setwd(pathName)
parent_path <- getwd()

# - global seed (the same value is used in all scripts)
set.seed(2019)

# - switch: TRUE removes re-tweets further below
retweet_out <- TRUE

# - load data: read the three minified sources; the Twitter and Facebook
#   tables get their Datum column coerced to Date right away.
tdf <- readRDS("../data/Twitter_data_minified.RDS")
tdf <- dplyr::mutate(tdf, Datum = as.Date(Datum))

sdf <- readRDS("../data/SMD_CDT_data_minified.RDS")

fdf <- readRDS("../data/Facebook_data_minified.RDS")
fdf <- dplyr::mutate(fdf, Datum = as.Date(Datum))

# - factorize topic: all three sources share one fixed level order
topic_levels <- c("Covid19", "Masks", "App", "App & Masks", "Anderes")
tdf$topic <- factor(tdf$topic, levels = topic_levels)
sdf$topic <- factor(sdf$topic, levels = topic_levels)
fdf$topic <- factor(fdf$topic, levels = topic_levels)

# - date ranges before trimming (printed for inspection)
range(tdf$Datum)
range(sdf$pubDateTime)
range(fdf$Datum)

# - set max and min date: both bounds are exclusive, so rows strictly after
#   2020-02-28 and strictly before 2020-08-23 are kept.
# NOTE(review): sdf$pubDateTime is compared against Date values here; this
# presumes the column is a Date (not a date-time) - confirm.
window_open <- as.Date("2020-02-28")
window_close <- as.Date("2020-08-23")

fdf <- dplyr::filter(fdf, Datum > window_open, Datum < window_close)
sdf <- dplyr::filter(sdf, pubDateTime > window_open, pubDateTime < window_close)
tdf <- dplyr::filter(tdf, Datum > window_open, Datum < window_close)

# - check if date-ranges are all equal
range(tdf$Datum)
range(sdf$pubDateTime)
range(fdf$Datum)

# - keep only rows whose language code `la` is "de"
fdf <- dplyr::filter(fdf, la == "de")
sdf <- dplyr::filter(sdf, la == "de")
tdf <- dplyr::filter(tdf, la == "de")

# Write out files for top feature extraction...
# (one RDS file per source and topic; printed topic values are for inspection)
unique(tdf$topic)

# topic label -> output file, Twitter
twitter_exports <- c(
  "App" = "../data/Twitter_app.RDS",
  "Masks" = "../data/Twitter_mask.RDS",
  "Covid19" = "../data/Twitter_covid.RDS"
)
for (topic_label in names(twitter_exports)) {
  tdf %>%
    dplyr::filter(topic == .env$topic_label) %>%
    saveRDS(twitter_exports[[topic_label]])
}

# topic label -> output file, Facebook
facebook_exports <- c(
  "App" = "../data/Facebook_app.RDS",
  "Masks" = "../data/Facebook_mask.RDS",
  "Covid19" = "../data/Facebook_covid.RDS"
)
for (topic_label in names(facebook_exports)) {
  fdf %>%
    dplyr::filter(topic == .env$topic_label) %>%
    saveRDS(facebook_exports[[topic_label]])
}

# drop the export helpers and return to the script directory
rm(twitter_exports, facebook_exports, topic_label)
setwd(parent_path)
##########################################################################################
# 3) Transform Data
##########################################################################################
## 3.1) Some minor transformations
##########################################################################################
# rename paper names (so_txt)
# Lookup table: name as found in the data -> harmonised outlet name.
# Names that are not listed are left untouched; missing values stay missing.
# NOTE(review): "rts.ch"/"RTS.ch" are mapped to "srf.ch" although every other
# entry only normalises spelling variants of one outlet - confirm this merge
# is intended.
paper_aliases <- c(
  "20 minuten online" = "20 minuten",
  "20 minutes" = "20 minuten",
  "20 minuti" = "20 minuten",
  "Newsnet / 24 heures" = "24 heures",
  "Newsnet / Basler Zeitung" = "Basler Zeitung",
  "Newsnet / Berner Zeitung" = "Berner Zeitung",
  "Newsnet / Der Bund" = "Der Bund",
  "Newsnet / Le Matin" = "Le Matin",
  "Newsnet / Tribune de Genève" = "Tribune de Genève",
  "Tribune de Genève" = "Tribune de Genève",
  "Newsnet / Tages-Anzeiger" = "Tages-Anzeiger",
  "Handelszeitung online" = "Handelszeitung",
  "rts.ch" = "srf.ch",
  "RTS.ch" = "srf.ch",
  "SWI swissinfo.ch" = "swissinfo.ch",
  "Finanz und Wirtschaft Online" = "Finanz und Wirtschaft",
  "Anzeigen von Uster" = "Anzeiger von Uster",
  "Anzegier von Uster" = "Anzeiger von Uster",
  "L'Agefi" = "Agefi",
  "Aargauer Zeitung" = "Aargauer Zeitung",
  "Aargauer Zeitung / MLZ" = "Aargauer Zeitung",
  "Migros-Magazin" = "Migros-Magazin",
  "Migros Magazine" = "Migros-Magazin",
  "Cooperazione" = "Coopzeitung",
  "Coopzeitung" = "Coopzeitung",
  "Coopération" = "Coopzeitung",
  "L'Express / L'Impartial" = "Arcinfo",
  "Arcinfo" = "Arcinfo"
)

sdf <- sdf %>%
  mutate(
    # look every name up in the table; fall back to the original name when
    # the lookup finds nothing
    so_txt = dplyr::coalesce(
      unname(paper_aliases[as.character(so_txt)]),
      as.character(so_txt)
    )
  )

# - harmonise party names in the Twitter data
# Party labels (lower-cased) that are kept; rows with a missing Party are kept
# as well because NA is part of the set.
kept_parties <- c(
  "alternative - die grünen zug",
  "alternative-die grünen kanton zug",
  "bürgerlich-demokratische partei schweiz",
  "christlich-soziale partei",
  "christlichdemokratische volkspartei der schweiz",
  "christlichdemokratische volkspartei oberwallis",
  "fdp.die liberalen",
  "grüne (basels starke alternative)",
  "grüne partei der schweiz",
  "grünliberale partei",
  "na",
  "nd",
  "schweizerische volkspartei",
  "sozialdemokratische partei der schweiz",
  NA
)

# Label groups that collapse into a single party abbreviation.
green_labels <- c(
  "grüne (basels starke alternative)",
  "grüne partei der schweiz",
  "alternative - die grünen zug",
  "alternative-die grünen kanton zug"
)
# NOTE(review): "christdemokratische volkspartei der schweiz" and
# "christlichsoziale volkspartei oberwallis" are not part of kept_parties, so
# they can never reach the recode below - confirm whether the filter should
# include them.
cvp_labels <- c(
  "christdemokratische volkspartei der schweiz",
  "christlichdemokratische volkspartei der schweiz",
  "christlich-soziale partei",
  "christlichdemokratische volkspartei oberwallis",
  "christlichsoziale volkspartei oberwallis"
)

tdf <- tdf %>%
  mutate(Party = tolower(Party)) %>%
  dplyr::filter(Party %in% kept_parties) %>%
  dplyr::mutate(Party = case_when(
    # the placeholder labels "na"/"nd" become real missing values
    Party %in% c("na", "nd") ~ NA_character_,
    Party %in% green_labels ~ "Grüne",
    Party == "sozialdemokratische partei der schweiz" ~ "SP",
    Party == "schweizerische volkspartei" ~ "SVP",
    Party == "fdp.die liberalen" ~ "FDP",
    Party %in% cvp_labels ~ "CVP",
    Party == "grünliberale partei" ~ "GLP",
    Party == "bürgerlich-demokratische partei schweiz" ~ "BDP",
    TRUE ~ Party
  ))

# - remove tweets from actors we are not interested in:
#   * drop the actor types "Institute", "Gericht" and the literal "NA"
#   * of the rest, keep rows that are of type "Party"/"Person" or that have no
#     party; i.e. drop other actor types that carry a party label
#   * finally, a missing actor type is labelled "ND"
tdf <- tdf %>%
  dplyr::filter(
    !Akteur.Typ %in% c("Institute", "Gericht", "NA"),
    Akteur.Typ %in% c("Party", "Person") | is.na(Party)
  ) %>%
  dplyr::mutate(
    Akteur.Typ = dplyr::coalesce(as.character(Akteur.Typ), "ND")
  )


# - remove all retweets from tweets (only when the switch is on); rows whose
#   Is_retweet is missing are dropped as well, since filter() keeps TRUE only
if (retweet_out) {
  tdf <- dplyr::filter(tdf, Is_retweet != TRUE)
  unique(tdf$Is_retweet) # - check if all is in order
}

# - free memory
gc()

# - check topics: number of SMD articles per topic (printed for inspection)
summarise(group_by(sdf, topic), n = n())
##########################################################################################
## 3.2) Get Data into Shape
##########################################################################################
# Transform SMD Data:
# count articles per pubDateTime and topic, then express each topic as a
# share of all articles with the same pubDateTime
smd_ana <- sdf %>%
  dplyr::count(pubDateTime, topic) %>%
  dplyr::group_by(pubDateTime) %>%
  dplyr::mutate(freq = n / sum(n))

# Transform CrowdTangle (Facebook) Data:
# actor types are collapsed into Party / Media / Politican (everything else);
# a missing actor type stays missing. freq is the topic share within one
# actor group and day.
fdf_ana <- fdf %>%
  dplyr::mutate(Akteur_Art = dplyr::case_when(
    is.na(Akteur.Typ) ~ NA_character_,
    Akteur.Typ == "Party" ~ "Party",
    Akteur.Typ == "Media" ~ "Media",
    TRUE ~ "Politican"
  )) %>%
  dplyr::count(Datum, topic, Akteur_Art) %>%
  dplyr::group_by(Akteur_Art, Datum) %>%
  dplyr::mutate(freq = n / sum(n))

# Transform Twitter Data
# same logic as for Facebook, with more actor groups; actor types that match
# none of the rules get the literal label "NA"
tdf_ana <- tdf %>%
  as.data.frame() %>%
  dplyr::mutate(
    Akteur.Typ = as.character(Akteur.Typ),
    Akteur_Art = dplyr::case_when(
      is.na(Akteur.Typ) ~ NA_character_,
      Akteur.Typ == "Party" ~ "Party",
      Akteur.Typ == "Media" ~ "Media",
      Akteur.Typ == "Person" ~ "Politican",
      Akteur.Typ %in% c("Administration", "Departement", "Bundesamt") ~ "Gov",
      Akteur.Typ %in% c("Organisation", "Komitee") ~ "Org",
      Akteur.Typ == "ND" ~ "SnowBallers",
      TRUE ~ "NA"
    )
  ) %>%
  dplyr::count(Datum, topic, Akteur_Art) %>%
  dplyr::group_by(Akteur_Art, Datum) %>%
  dplyr::mutate(freq = n / sum(n))

# - add missing dates with NA not 0
# - will be changed later to 0.01 before loglink transformation to avoid error
# NOTE(review): pubDateTime is converted to Date only here, after the counts
# were built per pubDateTime value - presumably the column already holds one
# value per day; confirm.
smd_ana <- smd_ana %>%
  dplyr::ungroup() %>%
  dplyr::mutate(pubDateTime = as.Date(pubDateTime)) %>%
  tidyr::complete(
    pubDateTime = seq(min(pubDateTime), max(pubDateTime), by = "day"),
    topic
  )

# Facebook and Twitter share the same steps: rename Datum to pubDateTime and
# add a row for every day x topic x actor-group combination.
pad_calendar <- function(daily_counts) {
  daily_counts %>%
    dplyr::ungroup() %>%
    dplyr::rename(pubDateTime = Datum) %>%
    dplyr::mutate(pubDateTime = as.Date(pubDateTime)) %>%
    tidyr::complete(
      pubDateTime = seq(min(pubDateTime), max(pubDateTime), by = "day"),
      topic,
      Akteur_Art
    )
}

fdf_ana <- pad_calendar(fdf_ana)
tdf_ana <- pad_calendar(tdf_ana)

# - spread dataframe with more than one grouping variable besides selectsclass
#   (one freq column per actor group; the raw counts are not carried over)
fdf_ana_wide <- tidyr::spread(
  dplyr::select(fdf_ana, -n),
  Akteur_Art, freq,
  fill = NA
)

tdf_ana_wide <- tidyr::spread(
  dplyr::select(tdf_ana, -n),
  Akteur_Art, freq,
  fill = NA
)

# the SMD series has no actor dimension; its share column is just renamed
smd_ana_wide <- smd_ana %>%
  dplyr::select(-n) %>%
  dplyr::rename(Media_SMD = freq)

# Remove empty Columns from wider and rename Columns for merge...
#
# finalise_wide() converts a wide table to a data.table, drops every actor
# column that contains only NA and appends `suffix` to the remaining actor
# columns.
#   wide     - wide table with the key columns plus one column per actor group
#   suffix   - text appended to every actor column name (e.g. "_TW")
#   expected - actor columns the downstream code relies on; a warning is
#              raised when the table holds a different set
#   keys     - columns that are never dropped or renamed
# Returns the finalised data.table.
#
# The columns are renamed by name instead of by position: assigning a fixed
# vector of names only works while the table has exactly the expected columns
# in alphabetical order, and would otherwise fail or silently attach the wrong
# label to a series.
finalise_wide <- function(wide, suffix, expected,
                          keys = c("pubDateTime", "topic")) {
  wide <- as.data.table(wide)

  # TRUE for every column that holds at least one non-missing value
  has_data <- vapply(wide, function(col) !all(is.na(col)), logical(1))
  # key columns are kept unconditionally
  has_data[names(wide) %in% keys] <- TRUE
  wide <- wide[, which(has_data), with = FALSE]

  actor_cols <- setdiff(names(wide), keys)
  if (!setequal(actor_cols, expected)) {
    warning(
      "Unexpected actor columns for suffix '", suffix, "': found [",
      paste(actor_cols, collapse = ", "), "], expected [",
      paste(expected, collapse = ", "), "]",
      call. = FALSE
    )
  }

  setnames(wide, actor_cols, paste0(actor_cols, suffix))
  wide
}

tdf_ana_wide <- finalise_wide(
  tdf_ana_wide,
  suffix = "_TW",
  expected = c("Gov", "Media", "Org", "Party", "Politican", "SnowBallers")
)

fdf_ana_wide <- finalise_wide(
  fdf_ana_wide,
  suffix = "_FB",
  expected = c("Media", "Party", "Politican")
)

# - column names of the SMD table (printed for inspection)
names(smd_ana_wide)
# - merge the three data frames: the SMD table is the base; the Facebook and
#   Twitter columns are attached by day and topic
merge_keys <- c("pubDateTime", "topic")
ana_data_wide <- smd_ana_wide %>%
  left_join(fdf_ana_wide, by = merge_keys) %>%
  left_join(tdf_ana_wide, by = merge_keys)
##########################################################################################
# 4) Save Data 
##########################################################################################
saveRDS(ana_data_wide, "../data/main_data_for_paper_de_only.RDS")

